Analysis of the UK Biobank neuroimaging data to build a model of brain ageing using multiple neuroimaging modalities. James Cole, 2nd October 2019
The data were initially read using Ken Hanscombe’s ukbtools package (ukb_df tool), which reads the .tab file, .r script and .html. This is slow process for large files, hence here the data are read and than saved as .rda files, which can be loaded more swiftly for convenience.
# clear workspace
rm(list = ls())
# Load packages
library(boot)
library(corrplot)
library(cowplot)
library(dplyr)
library(ggplot2)
library(glmnet)
library(heplots)
library(hier.part)
library(Hmisc)
library(kableExtra)
library(knitr)
library(lm.beta)
library(psych)
library(pwr)
library(tidyverse)
library(ukbtools)
Looks for existing file and loads is available.
if (file.exists("../ukb_data.rda")) {
cat("Loading data file", date(),sep = " ")
load(file = "../ukb_data.rda")
} else {
tmp <- ukb_df("ukb23892", path = "/Volumes/home/analysis/UKBiobank/ID_23892")
tmp1 <- ukb_df("ukb26571", path = "/Volumes/home/analysis/UKBiobank/ID_26571")
tmp2 <- read.table("../rs_fMRI_data.txt", colClasses = c("eid" = "character"))
df <- list(tmp, tmp1, tmp2) %>% reduce(left_join, by = "eid") # using purr
save(df, file = "../ukb_data.rda")
rm(list = ls(pattern = "tmp*"))
}
## Loading data file Fri Oct 18 14:57:24 2019
Data from n = 22392 UK Biobank participants were downloaded for the study.
Field 12188 uses coding 0=no, 1=yes, -1=unknown. Subset to only include those who completed brain MRI from imaging assessment.
df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0 <- as.factor(df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0)
table(df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0)
##
## -1 0 1
## 49 2025 20237
df <- subset(df, df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0 == 1)
df[,grep("age_when_attended", names(df))] <- lapply(df[,grep("age_when_attended", names(df))], as.numeric)
df$sex <- factor(df$sexuses_datacoding_9_f31_0_0)
df$sex <- dplyr::recode(df$sex, "0" = "Female", "1" = "Male")
df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 <- factor(df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0)
df$stroke_history <- factor(!is.na(df$age_stroke_diagnoseduses_datacoding_100291_f4056_2_0) & as.numeric(df$age_stroke_diagnoseduses_datacoding_100291_f4056_2_0) > 0, labels = c("no stroke", "stroke"))
var.list <- c("body_mass_index_bmi_f21001_2_0",
"diastolic_blood_pressure_automated_reading_f4079_2_0",
"diastolic_blood_pressure_automated_reading_f4079_2_1",
"systolic_blood_pressure_automated_reading_f4080_2_0",
"systolic_blood_pressure_automated_reading_f4080_2_1",
"hip_circumference_f49_2_0",
"weight_f21002_2_0",
"hand_grip_strength_left_f46_2_0",
"hand_grip_strength_right_f47_2_0",
"overall_health_ratinguses_datacoding_100508_f2178_2_0",
"longstanding_illness_disability_or_infirmityuses_datacoding_100349_f2188_2_0",
"height_f12144_2_0",
"mean_tfmri_head_motion_averaged_across_space_and_time_points_f25742_2_0",
"mean_rfmri_head_motion_averaged_across_space_and_time_points_f25741_2_0",
"volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0",
"fluid_intelligence_score_f20016_2_0",
"duration_to_complete_numeric_path_trail_1uses_datacoding_1990_f6348_2_0",
"duration_to_complete_alphanumeric_path_trail_2uses_datacoding_1990_f6350_2_0",
"number_of_puzzles_correctly_solved_f6373_2_0",
"duration_spent_answering_each_puzzle_f6333_2_0",
"number_of_puzzles_correct_f6382_2_0",
"duration_of_moderate_activityuses_datacoding_100291_f894_2_0",
"duration_of_vigorous_activityuses_datacoding_100291_f914_2_0")
df[,var.list] <- lapply(df[,var.list], as.numeric)
imaging.vars.list <- grep("forced|nifti|discrepancy", grep("volume|t2star|t2_flair|bold|activation|skeleton|tract|Partial_corr_25_dim", names(df), value = T), invert = T, value = T)
df[,grep("volume", names(df))] <- lapply(df[,grep("volume", names(df))], as.numeric)
df[,grep("t2star", names(df))] <- lapply(df[,grep("t2star", names(df))], as.numeric)
df[,grep("t2_flair", names(df))] <- lapply(df[,grep("t2_flair", names(df))], as.numeric)
## Warning in lapply(df[, grep("t2_flair", names(df))], as.numeric): NAs
## introduced by coercion
df[,grep("bold", names(df))] <- lapply(df[,grep("bold", names(df))], as.numeric)
df[,grep("skeleton", names(df))] <- lapply(df[,grep("skeleton", names(df))], as.numeric)
df[,grep("tract", names(df))] <- lapply(df[,grep("tract", names(df))], as.numeric)
df[,grep("activation", names(df))] <- lapply(df[,grep("activation", names(df))], as.numeric)
write.csv(file = "ukb_imaging_vars_list.csv", x = imaging.vars.list, quote = F, row.names = F,col.names = F)
## Warning in write.csv(file = "ukb_imaging_vars_list.csv", x =
## imaging.vars.list, : attempt to set 'col.names' ignored
Instance 0 is the baseline demographic clinical visit from 2006-2010. Instance 1 is a follow-up from 2012, no imaging then. Exclude people without complete imaging data.
table(complete.cases(df[,c(imaging.vars.list)]))
##
## FALSE TRUE
## 2776 17461
df <- df[complete.cases(df[,c(imaging.vars.list)]),]
df$age_at_scan <- df$age_when_attended_assessment_centre_f21003_2_0
describe(df$age_at_scan)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 17461 62.46 7.42 63 62.57 8.9 45 80 35 -0.14 -0.89
## se
## X1 0.06
table(df$sex)
##
## Female Male
## 9274 8187
hist(df$age_at_scan, breaks = 25, col = "darkgoldenrod2", xlab = 'Age at scan (years)')
Function to generate a data.frame with TRUE or FALSE for the presence/absence of ICD diagnosis. Adapted from Ken’s ukb_icd_diagnosis() function.
has_icd.diagnosis <- function(id, icd.version){
x <- df %>% dplyr::filter(eid %in% id) %>%
dplyr::select(matches(paste("^diagnoses.*icd", icd.version, sep = ""))) %>%
dplyr::select_if(colSums(!is.na(.)) > 0) %>% ncol() != 0
return(data.frame(id, x))
}
Running on the whole dataset takes >4 hours for n>20,000. So load .RDA file if avaialble.
if (file.exists("../ukb_icd.diagnosis_data.rda")) {
load("../ukb_icd.diagnosis_data.rda")
} else {
has_icd.diagnosis.df <- do.call(rbind, lapply(df$eid, function(x) has_icd.diagnosis(x, 10)))
save(has_icd.diagnosis.df, file = "../ukb_icd.diagnosis_data.rda")
}
Merge to add health status variable.
names(has_icd.diagnosis.df) <- c("eid", "icd_positive")
df <- merge(df, has_icd.diagnosis.df)
table(df$icd_positive)
##
## FALSE TRUE
## 4708 12753
Check number of ICD-positive against subjective health and longstanding illness.
table(df$overall_health_ratinguses_datacoding_100508_f2178_2_0, df$icd_positive)
##
## FALSE TRUE
## -3 0 3
## -1 3 16
## 1 1180 2141
## 2 3018 7963
## 3 436 2244
## 4 36 307
table(df$longstanding_illness_disability_or_infirmityuses_datacoding_100349_f2188_2_0, df$icd_positive)
##
## FALSE TRUE
## -3 1 14
## -1 40 196
## 0 3899 8723
## 1 733 3741
Include people with no ICD-10 diagnosis, no self-reported long-standing illness disability or infirmity (F2188) and good or excellent self-reported health (F2178). Remove subjects with NAs in the age column and with the missing imaging variables.
df$healthy <- ifelse(df$icd_positive == "FALSE" & df$longstanding_illness_disability_or_infirmityuses_datacoding_100349_f2188_2_0 == 0 & df$overall_health_ratinguses_datacoding_100508_f2178_2_0 >= 2 & df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 == 0 & df$stroke_history != "stroke", "healthy", "non-healthy")
healthy.df <- subset(df, df$healthy == "healthy")
table(is.na(df$healthy))
##
## FALSE TRUE
## 17426 35
Run descriptive statistics
describeBy(df$age_at_scan, df$healthy)
##
## Descriptive statistics by group
## group: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2725 61.47 7.21 62 61.48 8.9 46 79 33 -0.02 -0.9
## se
## X1 0.14
## --------------------------------------------------------
## group: non-healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 14701 62.64 7.45 63 62.78 8.9 45 80 35 -0.16 -0.88
## se
## X1 0.06
table(df$sex, df$healthy)
##
## healthy non-healthy
## Female 1343 7914
## Male 1382 6787
round(prop.table(table(df$sex, df$healthy), 2),3)
##
## healthy non-healthy
## Female 0.493 0.538
## Male 0.507 0.462
ggplot(subset(df, !is.na(df$healthy)), aes(age_at_scan, color = healthy)) +
geom_density(aes(fill = healthy), alpha = 0.5) +
xlab("Age at MRI scan (years)") +
theme_bw()
Lots of missing data here, around 50% NAs Coding here: http://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=1001
table(is.na(df$ethnic_backgrounduses_datacoding_1001_f21000_2_0))
##
## FALSE TRUE
## 8563 8898
table(df$ethnic_backgrounduses_datacoding_1001_f21000_2_0, df$healthy)
##
## healthy non-healthy
## -1 1 1
## -3 3 9
## 1 1 1
## 1001 1236 6694
## 1002 27 188
## 1003 33 144
## 2 0 1
## 2001 2 11
## 2002 1 4
## 2003 1 7
## 2004 5 12
## 3001 11 46
## 3002 1 13
## 3003 1 0
## 3004 2 11
## 4001 1 23
## 4002 2 16
## 5 4 17
## 6 5 28
round(100*prop.table(table(df$ethnic_backgrounduses_datacoding_1001_f21000_2_0, df$healthy), 2),2)
##
## healthy non-healthy
## -1 0.07 0.01
## -3 0.22 0.12
## 1 0.07 0.01
## 1001 92.45 92.64
## 1002 2.02 2.60
## 1003 2.47 1.99
## 2 0.00 0.01
## 2001 0.15 0.15
## 2002 0.07 0.06
## 2003 0.07 0.10
## 2004 0.37 0.17
## 3001 0.82 0.64
## 3002 0.07 0.18
## 3003 0.07 0.00
## 3004 0.15 0.15
## 4001 0.07 0.32
## 4002 0.15 0.22
## 5 0.30 0.24
## 6 0.37 0.39
BMI
by(df$body_mass_index_bmi_f21001_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2663 26.44 4.16 25.94 26.12 3.74 16.4 58.04 41.65 1.05 2.77
## se Q0.25 Q0.75
## 1 0.08 23.58 28.68
## --------------------------------------------------------
## df$healthy: non-healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 14348 26.6 4.41 25.94 26.22 3.87 13.39 55.23 41.84 1.05 2.18
## se Q0.25 Q0.75
## 1 0.04 23.56 28.88
Weight
by(df$weight_f21002_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2665 76.3 14.8 75 75.57 14.53 41.6 179.8 138.2 0.73 1.81
## se Q0.25 Q0.75
## 1 0.29 65.6 85.4
## --------------------------------------------------------
## df$healthy: non-healthy
## vars n mean sd median trimmed mad min max range skew
## 1 1 14382 76.14 15.13 74.6 75.15 14.68 33 169.2 136.2 0.79
## kurtosis se Q0.25 Q0.75
## 1 1.27 0.13 65.2 85
Hip circumference
by(df$hip_circumference_f49_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2667 100.98 8.19 100 100.49 7.41 80 152 72 0.83 1.92
## se Q0.25 Q0.75
## 1 0.16 95.5 105
## --------------------------------------------------------
## df$healthy: non-healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 14400 101.39 8.75 100 100.77 7.41 68 156 88 0.97 2.36
## se Q0.25 Q0.75
## 1 0.07 96 106
print("Diastolic")
## [1] "Diastolic"
by(df$diastolic_blood_pressure_automated_reading_f4079_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2254 79.67 10.61 79 79.52 10.38 43 114 71 0.14 -0.03
## se Q0.25 Q0.75
## 1 0.22 72 87
## --------------------------------------------------------
## df$healthy: non-healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 12097 78.35 10.53 78 78.1 10.38 38 128 90 0.27 0.17
## se Q0.25 Q0.75
## 1 0.1 71 85
print("Systolic")
## [1] "Systolic"
by(df$systolic_blood_pressure_automated_reading_f4080_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2254 139.55 18.95 139 138.86 19.27 81 212 131 0.37 0.06
## se Q0.25 Q0.75
## 1 0.4 126 151.75
## --------------------------------------------------------
## df$healthy: non-healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 12097 138.43 18.94 137 137.7 19.27 78 228 150 0.43 0.38
## se Q0.25 Q0.75
## 1 0.17 125 150
table(df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, df$healthy)
##
## healthy non-healthy
## -1 0 24
## -3 0 15
## 0 2725 13747
## 1 0 836
round(100*prop.table(table(df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, df$healthy), 2),2)
##
## healthy non-healthy
## -1 0.00 0.16
## -3 0.00 0.10
## 0 100.00 94.02
## 1 0.00 5.72
table(df$stroke_history, df$healthy)
##
## healthy non-healthy
## no stroke 2725 14500
## stroke 0 201
round(100*prop.table(table(df$stroke_history, df$healthy), 2),2)
##
## healthy non-healthy
## no stroke 100.00 98.63
## stroke 0.00 1.37
Use 80% for training and 20% for validation/testing.
# Determine sample size and assign training/test group variables
set.seed(1982)
index <- sample(2, nrow(healthy.df), replace = TRUE, prob = c(0.8, 0.2))
table(index)
## index
## 1 2
## 2205 520
# Split data into training/testing and keep imaging variables only
train_data <- healthy.df[index == 1, imaging.vars.list]
test_data <- healthy.df[index == 2, imaging.vars.list]
# Define objects with age labels for training and test sets
train_labels <- healthy.df[index == 1, "age_at_scan"]
test_labels <- healthy.df[index == 2, "age_at_scan"]
describe(train_labels)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2205 61.54 7.24 62 61.54 8.9 46 79 33 -0.03 -0.89
## se
## X1 0.15
describe(test_labels)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 520 61.2 7.07 61 61.2 8.9 47 78 31 0 -0.97 0.31
par(mfrow = c(1,2))
hist(train_labels, breaks = 20, col = "darkgoldenrod2", main = "Training set age", xlab = "Age at scan (years)")
hist(test_labels, breaks = 20, col = "darkgoldenrod2", main = "Validation set age", xlab = "Age at scan (years)")
par(mfrow = c(1,1))
This is essential for ANN and probably a good idea for all the models.
scaled.train_data <- scale(train_data, scale = TRUE, center = TRUE)
scaling.parameters.center <- attr(scaled.train_data, "scaled:center")
scaling.parameters.scale <- attr(scaled.train_data, "scaled:scale")
scaled.test_data <- as.data.frame(scale(test_data, scaling.parameters.center, scaling.parameters.scale))
scaled.train_data <- as.data.frame(scaled.train_data)
Plot of n=1079 variables using the index order (which is basically arbitrary). With n = 2725, very small r values will be significant at 0.05. Bonferroni adjusted pvalues need to be below 0.05/1079 = 4.633920310^{-5}.
alpha <- 0.05
power <- pwr.r.test(n = length(healthy.df$age_at_scan), sig.level = alpha, power = 0.8, alternative = "greater")
power.bonf <- pwr.r.test(n = length(healthy.df$age_at_scan), sig.level = alpha/length(imaging.vars.list), power = 0.8, alternative = "greater")
plot(sapply(healthy.df[imaging.vars.list], function(x) cor(x, healthy.df$age_at_scan)),
type = "h",
col = "darkgoldenrod2",
ylab = "Pearson's r with age");abline(h = 0);abline(h = power$r, lty = "dashed", col = "grey40");abline(h = (0 - power$r), lty = "dashed", col = "grey40"); abline(h = power.bonf$r, col = "darkred", lty = 2); abline(h = (0 - power.bonf$r), col = "darkred", lty = 2)
text(x = 0, y = 0.065, "Uncorrected p = 0.05", col = "grey40", adj = 0, cex = 0.8)
text(x = 0, y = 0.13, "Bonferroni p = 0.05", col = "darkred", adj = 0, cex = 0.8)
Of the univariate correlations with age, 886 are significant at p = 0.05. When using Bonferroni correction 704 are significant at p = 0.05.
Take predicted age values as input.
test_results <- function(pred) {
r <- cor.test(test_labels, pred)$estimate
r.sq <- summary(lm(test_labels ~ pred))$r.squared
MAE <- mean(abs(pred - test_labels), na.rm = T)
age.bias <- cor.test(test_labels, (pred - test_labels))$estimate
value <- sapply(c(r,r.sq, MAE, age.bias), function(x) round(x, 3))
results <- cbind(c("r", "R^2", "MAE", "Age.bias"), value)
return(kable(results) %>% kable_styling())
}
age_plot <- function(pred) {
qplot(x = test_labels, y = pred) +
geom_abline(slope = 1, intercept = 0) +
geom_point(shape = 21, bg = "darkgoldenrod2", size = 2) +
geom_smooth(method = "lm", col = "darkgrey") +
xlab("Age (years)") +
ylab(deparse(substitute(pred))) +
theme_bw()
}
Using the glmnet package. Alpha = 1 is for LASSO penalisation (0 = ridge, 0.5 = elastic net).
x.train <- as.matrix(scaled.train_data)
dimnames(x.train) <- NULL
y.train <- as.matrix(train_labels)
## cross-validation for lambda
lasso.fit.cv <- cv.glmnet(x = x.train, y = y.train,
alpha = 1, family = "gaussian")
Plot results. The minimum lambda value is 0.055, while the optimal lambda value (i.e., the highest value within 1 standard error of the minimum) is 0.105.
plot(lasso.fit.cv)
## fit model using optimal lambda value (1 SE value, not minimum)
lasso.fit <- glmnet(x = x.train, y = y.train,
alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
lasso.pred <- predict(lasso.fit, newx = as.matrix(scaled.test_data))
test_results(lasso.pred)
| value | ||
|---|---|---|
| cor | r | 0.786 |
| R^2 | 0.618 | |
| MAE | 3.515 | |
| cor | Age.bias | -0.654 |
age_plot(lasso.pred)
ggsave("~/Work/Articles/Brain age/UK Biobank multi-modal brain age/brain_age_scatterplot.pdf", useDingbats = FALSE, dpi = 75, height = 4, width = 4)
LASSO.coefficient <- coef(lasso.fit, s = lasso.fit.cv$lambda.1se)[-1]
var.coefs <- data.frame(imaging.vars.list, LASSO.coefficient)
non.zero_vars <- subset(var.coefs, var.coefs$LASSO.coefficient != 0)
non.zero_vars$imaging.vars.list <- factor(non.zero_vars$imaging.vars.list)
Out of the original 1079 variables, the LASSO regression set 221 to non-zero, thus 858 variables were removed.
Bootstrap 95% confidence intervals. Uses the boot package.
Essential to convert coefficients to vector that stores zeros.
lasso.coef <- function(data, indices) {
d <- data[indices,]
fit <- glmnet(x = d[,-1], y = d[,1],
alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
return(coef(fit)[,1])
}
Normal printing and plotting of results doesn’t work for high-dimensional datasets. Load data file if it already exists.
if (file.exists("../lasso.boot.out.rda")) {
load("../lasso.boot.out.rda")
} else {
cat("running bootstraps")
boot.out <- boot(data = cbind(y.train, x.train), statistic = lasso.coef, R = 1000)
save(boot.out, file = "../lasso.boot.out.rda")
}
There were 221 non-zero coefficients.
Check histogram of bootstrap coefficients for top variable by way of example.
hist(boot.out$t[,which.max(abs(boot.out$t0[-1])) + 1], breaks = 100, col = "darkgoldenrod2")
ci.vector <- function(index, boot.object, ci.type) {
x <- boot.ci(boot.object, type = ci.type, index = index)
return(x[4])
}
Use my ci.vector() function (defined above) to derive confidence intervals.
n <- length(boot.out$t0)
boot.ci.out <- sapply(1:n, ci.vector, boot.object = boot.out, ci.type = "basic")
x <- boot.out$t0[1:n]
y <- data.frame(t(matrix(unlist(boot.ci.out), ncol = n)))[4:5]
ci.df <- cbind(x, y)
names(ci.df) <- c("coef", "l.ci", "u.ci")
Identify variables with confidence intervals that do not overlap zero.
# drop intercept from plot using [-1] in vector ci.df$l.ci and ci.df$u.ci (i.e., the intercept is the top row)
sig.vars.index <- which(ci.df$l.ci[-1] > 0 | ci.df$u.ci[-1] < 0)
sig.vars.list <- imaging.vars.list[sig.vars.index]
sig.vars.df <- ci.df[sig.vars.index + 1,] ## add 1 to omit intercept row
sig.vars.df <- cbind(sig.vars.list, round(sig.vars.df,3))
kable(sig.vars.df[order(abs(sig.vars.df$coef), decreasing = T),]) %>% kable_styling()
| sig.vars.list | coef | l.ci | u.ci | |
|---|---|---|---|---|
| V6 | volume_of_grey_matter_normalised_for_head_size_f25005_2_0 | -1.466 | -1.851 | -1.242 |
| V800 | weightedmean_icvf_in_tract_forceps_minor_f25661_2_0 | -1.029 | -1.789 | -0.699 |
| V26 | volume_of_brain_stem_4th_ventricle_f25025_2_0 | 0.692 | 0.484 | 1.139 |
| V164 | volume_of_grey_matter_in_ventral_striatum_left_f25890_2_0 | -0.666 | -1.022 | -0.393 |
| V711 | weightedmean_l1_in_tract_anterior_thalamic_radiation_right_f25572_2_0 | 0.513 | 0.218 | 1.026 |
| V584 | mean_isovf_in_fornix_on_fa_skeleton_f25445_2_0 | 0.501 | 0.219 | 1.003 |
| V195 | mean_fa_in_middle_cerebellar_peduncle_on_fa_skeleton_f25056_2_0 | -0.477 | -0.910 | -0.275 |
| V157 | volume_of_grey_matter_in_putamen_right_f25883_2_0 | 0.460 | 0.262 | 0.911 |
| V13 | volume_of_thalamus_right_f25012_2_0 | -0.454 | -0.909 | -0.116 |
| V210 | mean_fa_in_cerebral_peduncle_on_fa_skeleton_left_f25071_2_0 | -0.453 | -0.906 | -0.226 |
| V208 | mean_fa_in_superior_cerebellar_peduncle_on_fa_skeleton_left_f25069_2_0 | 0.429 | 0.053 | 0.859 |
| V339 | mean_l1_in_middle_cerebellar_peduncle_on_fa_skeleton_f25200_2_0 | -0.428 | -0.815 | -0.162 |
| V463 | mean_l3_in_posterior_thalamic_radiation_on_fa_skeleton_right_f25324_2_0 | 0.418 | 0.129 | 0.835 |
| V426 | mean_l2_in_fornix_cresstria_terminalis_on_fa_skeleton_left_f25287_2_0 | 0.393 | 0.062 | 0.787 |
| V486 | mean_icvf_in_body_of_corpus_callosum_on_fa_skeleton_f25347_2_0 | 0.381 | 0.031 | 0.762 |
| V356 | mean_l1_in_anterior_limb_of_internal_capsule_on_fa_skeleton_left_f25217_2_0 | 0.349 | 0.095 | 0.698 |
| V330 | mean_mo_in_fornix_cresstria_terminalis_on_fa_skeleton_left_f25191_2_0 | -0.336 | -0.610 | -0.061 |
| V173 | volume_of_grey_matter_in_vi_cerebellum_right_f25899_2_0 | -0.330 | -0.661 | -0.059 |
| V137 | volume_of_grey_matter_in_frontal_operculum_cortex_right_f25863_2_0 | -0.321 | -0.582 | -0.146 |
| V572 | mean_od_in_superior_longitudinal_fasciculus_on_fa_skeleton_left_f25433_2_0 | 0.316 | 0.169 | 0.633 |
| V178 | volume_of_grey_matter_in_vermis_crus_ii_cerebellum_f25904_2_0 | 0.306 | 0.115 | 0.532 |
| V787 | weightedmean_l3_in_tract_uncinate_fasciculus_left_f25648_2_0 | 0.302 | 0.002 | 0.603 |
| V16 | volume_of_putamen_left_f25015_2_0 | -0.284 | -0.567 | -0.011 |
| V279 | mean_md_in_cingulum_hippocampus_on_fa_skeleton_right_f25140_2_0 | -0.272 | -0.544 | -0.109 |
| V391 | mean_l2_in_splenium_of_corpus_callosum_on_fa_skeleton_f25252_2_0 | -0.271 | -0.542 | -0.014 |
| V683 | weightedmean_mo_in_tract_anterior_thalamic_radiation_left_f25544_2_0 | 0.263 | 0.065 | 0.526 |
| V48 | 90th_percentile_of_bold_effect_in_groupdefined_mask_for_shapes_activation_f25761_2_0 | 0.229 | 0.050 | 0.425 |
| V692 | weightedmean_mo_in_tract_forceps_minor_f25553_2_0 | -0.226 | -0.452 | -0.031 |
| V31 | median_t2star_in_putamen_left_f25030_2_0 | -0.221 | -0.442 | -0.016 |
| V153 | volume_of_grey_matter_in_thalamus_right_f25879_2_0 | 0.217 | 0.005 | 0.435 |
| V45 | median_bold_effect_in_groupdefined_mask_for_facesshapes_contrast_f25048_2_0 | -0.212 | -0.423 | -0.060 |
| V714 | weightedmean_l1_in_tract_parahippocampal_part_of_cingulum_left_f25575_2_0 | -0.174 | -0.348 | -0.001 |
| V1026 | Partial_corr_25_dim_25752_2_0_v157 | 0.165 | 0.024 | 0.329 |
| V421 | mean_l2_in_cingulum_cingulate_gyrus_on_fa_skeleton_right_f25282_2_0 | -0.119 | -0.237 | -0.009 |
## sort dataset by coefficient
ci.df2 <- ci.df[order(ci.df$coef, decreasing = T),]
# drop intercept from plot using [-1,] in data.frame ci.df (i.e., the intercept is the top row)
plot(ci.df2[-1,1], ylim = c(min(ci.df2[-1,2]), max(ci.df2[-1,3])),
pch = 20, col = "darkgoldenrod2", ylab = "LASSO coefficient") +
arrows(x0 = 1:(n - 1), y0 = ci.df2[-1,2], y1 = ci.df2[-1,3],
length = 0.02, angle = 90, code = 3, col = "grey") +
abline(h = 0, type = 2)
## integer(0)
There are 34 variables with CIs that don’t overlap zero.
## sort dataset by coefficient
sig.vars.df2 <- sig.vars.df[order(sig.vars.df$coef, decreasing = T),]
opar <- par()
par(mar = c(15, 4, 1, 2))
axis_labels <- gsub("_f....._2_0", "", sig.vars.df2[,1])
plot(sig.vars.df2$coef, ylim = c(min(sig.vars.df2$l.ci),max(sig.vars.df2$u.ci)),
pch = 20, col = "darkgoldenrod2", ylab = "LASSO coefficient", xaxt = "n", xlab = "") +
arrows(x0 = 1:dim(sig.vars.df2)[1], y0 = sig.vars.df2$l.ci, y1 = sig.vars.df2$u.ci,
length = 0.02, angle = 90, code = 3, col = "grey") +
abline(h = 0, type = 2)
## integer(0)
axis(side = 1, at = 1:length(sig.vars.list), labels = axis_labels, las = 2, cex.axis = 0.8)
par(opar)
top.ols <- lm(train_labels ~ .,
data = scaled.train_data[,sig.vars.index])
top.ols.pred <- predict(object = top.ols, newdata = scaled.test_data[,sig.vars.index])
test_results(top.ols.pred)
| value | ||
|---|---|---|
| cor | r | 0.734 |
| R^2 | 0.538 | |
| MAE | 3.817 | |
| cor | Age.bias | -0.59 |
age_plot(top.ols.pred)
## fit model using optimal lambda value (1 SE value, not minimum)
top.lasso.fit <- glmnet(x = x.train[,sig.vars.index], y = y.train,
alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
top.lasso.pred <- predict(top.lasso.fit, newx = as.matrix(scaled.test_data[,sig.vars.index]))
test_results(top.lasso.pred)
| value | ||
|---|---|---|
| cor | r | 0.735 |
| R^2 | 0.541 | |
| MAE | 3.786 | |
| cor | Age.bias | -0.663 |
age_plot(top.lasso.pred)
Using the same training/test split as above, and using the same random seed.
run_lasso <- function(list) {
train_data <- healthy.df[index == 1, list]
test_data <- healthy.df[index == 2, list]
scaled.train_data <- as.data.frame(scale(train_data))
scaled.test_data <- as.data.frame(scale(test_data))
x.train <- as.matrix(scaled.train_data)
dimnames(x.train) <- NULL
y.train <- as.matrix(train_labels)
## cross-validation for lambda
lasso.fit.cv <- cv.glmnet(x = x.train, y = y.train, alpha = 1, family = "gaussian")
lasso.fit <- glmnet(x = x.train, y = y.train, alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
lasso.pred <- predict(lasso.fit, newx = as.matrix(scaled.test_data))
dimnames(lasso.pred)[[2]] <- deparse(substitute(list))
# return(test_results(lasso.pred))
return(lasso.pred)
}
t1.vars.list <- grep("forced|nifti|discrepancy|t2_flair", grep("volume", names(df), value = T), invert = T, value = T)
t1.pred <- run_lasso(t1.vars.list)
test_results(t1.pred)
| value | ||
|---|---|---|
| cor | r | 0.685 |
| R^2 | 0.47 | |
| MAE | 4.132 | |
| cor | Age.bias | -0.716 |
Only one variable, just use OLS regression.
t2.ols <- lm(train_labels ~ total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0,
data = scaled.train_data)
t2.ols.pred <- predict(object = t2.ols, newdata = scaled.test_data)
test_results(t2.ols.pred)
| value | ||
|---|---|---|
| cor | r | 0.308 |
| R^2 | 0.095 | |
| MAE | 5.653 | |
| cor | Age.bias | -0.942 |
t2star.vars.list <- grep("forced|nifti|discrepancy", grep("t2star", names(df), value = T), invert = T, value = T)
t2star.pred <- run_lasso(t2star.vars.list)
test_results(t2star.pred)
| value | ||
|---|---|---|
| cor | r | 0.329 |
| R^2 | 0.108 | |
| MAE | 5.78 | |
| cor | Age.bias | -0.987 |
diffusion.vars.list <- grep("forced|nifti|discrepancy", grep("skeleton|tract", names(df), value = T), invert = T, value = T)
diffusion.pred <- run_lasso(diffusion.vars.list)
test_results(diffusion.pred)
| value | ||
|---|---|---|
| cor | r | 0.732 |
| R^2 | 0.536 | |
| MAE | 3.888 | |
| cor | Age.bias | -0.628 |
task.vars.list <- grep("forced|nifti|discrepancy", grep("bold|activation", names(df), value = T), invert = T, value = T)
task.pred <- run_lasso(task.vars.list)
test_results(task.pred)
| value | ||
|---|---|---|
| cor | r | 0.165 |
| R^2 | 0.027 | |
| MAE | 5.917 | |
| cor | Age.bias | -0.982 |
rest.vars.list <- grep("forced|nifti|discrepancy", grep("Partial_corr_25_dim", names(df), value = T), invert = T, value = T)
rest.pred <- run_lasso(rest.vars.list)
test_results(rest.pred)
| value | ||
|---|---|---|
| cor | r | 0.437 |
| R^2 | 0.191 | |
| MAE | 5.282 | |
| cor | Age.bias | -0.927 |
pred.mat <- cbind(test_labels, t1.pred, t2.ols.pred, t2star.pred, diffusion.pred, task.pred, rest.pred)
colnames(pred.mat) <- c("Age", "T1-weighted","T2-FLAIR","T2*","Diffusion","Task fMRI","Resting-state fMRI")
# col1 <- colorRampPalette(brewer.pal(n = 10, name = "RdBu"))
col1 <- colorRampPalette(colors = c("firebrick", "white", "dodgerblue"))
cairo_pdf(filename = "~/Work/Articles/Brain age/UK Biobank multi-modal brain age/variable_corrplot.pdf")
corrplot(cor(pred.mat), type = "upper", diag = T, method = "color", col = col1(100),
addCoef.col = "black", tl.col = "black")
dev.off()
## quartz_off_screen
## 2
corrplot(cor(pred.mat), type = "upper", diag = T, method = "color", col = col1(100),
addCoef.col = "black", tl.col = "black")
Define non-healthy people as testing set.
non.healthy.df <- subset(df, df$healthy == "non-healthy")
table(complete.cases(non.healthy.df[,imaging.vars.list]))
##
## TRUE
## 14701
describe(non.healthy.df$age_at_scan)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 14701 62.64 7.45 63 62.78 8.9 45 80 35 -0.16 -0.88
## se
## X1 0.06
describeBy(non.healthy.df$age_at_scan, non.healthy.df$sex)
##
## Descriptive statistics by group
## group: Female
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 7914 61.87 7.33 62 61.93 8.9 45 80 35 -0.07 -0.89
## se
## X1 0.08
## --------------------------------------------------------
## group: Male
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 6787 63.54 7.49 65 63.8 7.41 45 80 35 -0.29 -0.82
## se
## X1 0.09
Scale new subjects variables using the scaling parameters from the training set.
scaled.non.healthy.test <- as.data.frame(scale(non.healthy.df[,imaging.vars.list], scaling.parameters.center, scaling.parameters.scale))
lasso.non.healthy.pred <- as.numeric(predict(lasso.fit, newx = as.matrix(scaled.non.healthy.test)))
non.healthy.labels <- non.healthy.df$age_at_scan
test_results2 <- function(pred, labels) {
r <- cor.test(labels, pred)$estimate
r.sq <- summary(lm(labels ~ pred))$r.squared
MAE <- mean(abs(pred - labels), na.rm = T)
age.bias <- cor.test(labels, (pred - labels))$estimate
value <- sapply(c(r,r.sq, MAE, age.bias), function(x) round(x, 3))
results <- cbind(c("r", "R^2", "MAE", "Age.bias"), value)
return(kable(results) %>% kable_styling())
}
test_results2(lasso.non.healthy.pred, non.healthy.labels)
| value | ||
|---|---|---|
| cor | r | 0.806 |
| R^2 | 0.649 | |
| MAE | 3.523 | |
| cor | Age.bias | -0.624 |
qplot(x = non.healthy.labels, y = lasso.non.healthy.pred) +
geom_abline(slope = 1, intercept = 0) + geom_point(shape = 21, bg = "darkgoldenrod2", size = 2) +
geom_smooth(method = "lm", col = "grey") +
xlab("Age (years)") +
theme_bw()
Calculate age bias in initial test data.
bias.model <- lm(lasso.pred ~ test_labels)
bias.model$coefficients[1]
## (Intercept)
## 24.74109
bias.model$coefficients[2]
## test_labels
## 0.5953419
Apply correction to new (non-healthy) test data. Subtract the intercept and then divide by the slope
lasso.non.healthy.pred.corrected <- (lasso.non.healthy.pred - bias.model$coefficients[1]) / bias.model$coefficients[2]
test_results2(lasso.non.healthy.pred.corrected, non.healthy.labels)
| value | ||
|---|---|---|
| cor | r | 0.806 |
| R^2 | 0.649 | |
| MAE | 4.551 | |
| cor | Age.bias | 0.076 |
qplot(x = non.healthy.labels, y = lasso.non.healthy.pred.corrected) +
geom_abline(slope = 1, intercept = 0) + geom_point(shape = 21, bg = "darkgoldenrod2", size = 2) +
geom_smooth(method = "lm", col = "grey") +
xlab("Age (years)") +
theme_bw()
Using cowplot package
plot1 <- qplot(x = non.healthy.labels, y = lasso.non.healthy.pred) +
geom_abline(slope = 1, intercept = 0) + geom_point(shape = 21, bg = "darkgoldenrod2", size = 2) +
geom_smooth(method = "lm", col = "grey") +
xlab("Age (years)") +
ylab("Brain-predicted age (years)") +
theme_bw()
plot2 <- qplot(x = non.healthy.labels, y = lasso.non.healthy.pred.corrected) +
geom_abline(slope = 1, intercept = 0) + geom_point(shape = 21, bg = "#0ebdee", size = 2) +
geom_smooth(method = "lm", col = "grey") +
xlab("Age (years)") +
ylab("Bias-adjusted brain-predicted age (years)") +
theme_bw()
plot_grid(plot1, plot2, ncol = 2, labels = c("A", "B"))
ggsave("~/Work/Articles/Brain age/UK Biobank multi-modal brain age/bias_correction_scatterplot.pdf", plot_grid(plot1, plot2, ncol = 2, labels = c("A", "B")), useDingbats = FALSE, dpi = 100, height = 4, width = 8)
Define brain-PAD and look at descriptives
non.healthy.df$brainPAD <- lasso.non.healthy.pred.corrected - non.healthy.df$age_at_scan
describe(non.healthy.df$brainPAD)
## vars n mean sd median trimmed mad min max range skew
## X1 1 14701 0.6 5.81 0.36 0.47 5.57 -22.68 37.5 60.17 0.34
## kurtosis se
## X1 0.85 0.05
hist(non.healthy.df$brainPAD, breaks = 25, col = "darkgoldenrod2")
Decide what to use as covariates for brain-PAD
non.healthy.df$head_motion_tfmri <- non.healthy.df$mean_tfmri_head_motion_averaged_across_space_and_time_points_f25742_2_0
m1 <- (lm(brainPAD ~ poly(age_at_scan, 2, raw = F) +
sex +
height_f12144_2_0 +
volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 +
head_motion_tfmri,
data = non.healthy.df))
summary(m1)
##
## Call:
## lm(formula = brainPAD ~ poly(age_at_scan, 2, raw = F) + sex +
## height_f12144_2_0 + volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 +
## head_motion_tfmri, data = non.healthy.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.188 -3.708 -0.182 3.553 34.408
##
## Coefficients:
## Estimate
## (Intercept) -3.511041
## poly(age_at_scan, 2, raw = F)1 12.548318
## poly(age_at_scan, 2, raw = F)2 56.145772
## sexMale 0.864451
## height_f12144_2_0 -0.015664
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 2.819859
## head_motion_tfmri 18.808912
## Std. Error
## (Intercept) 1.623130
## poly(age_at_scan, 2, raw = F)1 5.932977
## poly(age_at_scan, 2, raw = F)2 5.638440
## sexMale 0.146429
## height_f12144_2_0 0.007719
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 0.511237
## head_motion_tfmri 0.739811
## t value
## (Intercept) -2.163
## poly(age_at_scan, 2, raw = F)1 2.115
## poly(age_at_scan, 2, raw = F)2 9.958
## sexMale 5.904
## height_f12144_2_0 -2.029
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 5.516
## head_motion_tfmri 25.424
## Pr(>|t|)
## (Intercept) 0.0305
## poly(age_at_scan, 2, raw = F)1 0.0344
## poly(age_at_scan, 2, raw = F)2 < 2e-16
## sexMale 3.64e-09
## height_f12144_2_0 0.0424
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 3.53e-08
## head_motion_tfmri < 2e-16
##
## (Intercept) *
## poly(age_at_scan, 2, raw = F)1 *
## poly(age_at_scan, 2, raw = F)2 ***
## sexMale ***
## height_f12144_2_0 *
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 ***
## head_motion_tfmri ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.631 on 14694 degrees of freedom
## Multiple R-squared: 0.06161, Adjusted R-squared: 0.06122
## F-statistic: 160.8 on 6 and 14694 DF, p-value: < 2.2e-16
lm.beta(m1)
##
## Call:
## lm(formula = brainPAD ~ poly(age_at_scan, 2, raw = F) + sex +
## height_f12144_2_0 + volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 +
## head_motion_tfmri, data = non.healthy.df)
##
## Standardized Coefficients::
## (Intercept)
## 0.00000000
## poly(age_at_scan, 2, raw = F)1
## 0.01780954
## poly(age_at_scan, 2, raw = F)2
## 0.07968640
## sexMale
## 0.07416026
## height_f12144_2_0
## -0.02520155
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0
## 0.05919336
## head_motion_tfmri
## 0.21139154
etasq(m1, partial = T)
## Partial eta^2
## poly(age_at_scan, 2, raw = F) 0.0069737324
## sex 0.0023662494
## height_f12144_2_0 0.0002801883
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 0.0020661992
## head_motion_tfmri 0.0421356278
## Residuals NA
Used hier.part package to define unique (i.e., independent) variance and joint (i.e., shared) variance in brain-PAD.
hp.res <- hier.part(non.healthy.df$brainPAD, non.healthy.df[c("age_at_scan","sex", "height_f12144_2_0", "volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0", "head_motion_tfmri")], gof = "Rsqu", barplot = F)
round(hp.res$gfs,3)
## [1] 0.000 0.006 0.002 0.000 0.001 0.051 0.007 0.006 0.007 0.052 0.006
## [12] 0.007 0.052 0.001 0.051 0.052 0.009 0.012 0.052 0.007 0.052 0.053
## [23] 0.009 0.053 0.055 0.052 0.013 0.053 0.055 0.053 0.055 0.055
round(hp.res$IJ,3)
## I
## age_at_scan 0.003
## sex 0.003
## height_f12144_2_0 0.001
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 0.002
## head_motion_tfmri 0.047
## J
## age_at_scan 0.003
## sex -0.001
## height_f12144_2_0 0.000
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 -0.001
## head_motion_tfmri 0.004
## Total
## age_at_scan 0.006
## sex 0.002
## height_f12144_2_0 0.000
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 0.001
## head_motion_tfmri 0.051
round(hp.res$I.perc,3)
## I
## age_at_scan 5.136
## sex 5.114
## height_f12144_2_0 1.100
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 3.498
## head_motion_tfmri 85.152
P-values are corrected using FDR for 18 different comparisons.
run_lm <- function(var, data) {
m1 <- lm(brainPAD ~ data[[var]] +
poly(age_at_scan, 2, raw = F) +
sex +
height_f12144_2_0 +
volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 +
head_motion_tfmri,
data = data)
y <- round(summary(m1)$coefficients[2,],3)
z <- round(p.adjust(summary(m1)$coefficients[2,4], method = "fdr", n = 18),5)
names(z) <- "corrected_p"
part.eta <- round(etasq(m1, partial = T)[1,1],4)
names(part.eta) <- "partial eta^2"
return(c(y,z,part.eta))
}
run_lm("diastolic_blood_pressure_automated_reading_f4079_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.0480 0.0050 9.4740 0.0000 0.0000
## partial eta^2
## 0.0074
run_lm("systolic_blood_pressure_automated_reading_f4080_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.0270 0.0030 9.4170 0.0000 0.0000
## partial eta^2
## 0.0073
run_lm("body_mass_index_bmi_f21001_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.005 0.013 0.360 0.719 1.000
## partial eta^2
## 0.000
run_lm("weight_f21002_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.0050 0.0040 1.1160 0.2640 1.0000
## partial eta^2
## 0.0001
run_lm("hip_circumference_f49_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.0100 0.0060 -1.6860 0.0920 1.0000
## partial eta^2
## 0.0002
Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.
with(subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3), table(diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0))
## diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0
## -1 -3 0 1
## 0 0 13747 836
run_lm("diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0", subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 2.0060 0.2040 9.8400 0.0000 0.0000
## partial eta^2
## 0.0066
TukeyHSD(aov(brainPAD ~ diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, data = subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3)))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = brainPAD ~ diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, data = subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3))
##
## $diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0
## diff lwr upr p adj
## 1-0 3.037828 2.635421 3.440235 0
Question What was your age when the stroke was first diagnosed? recoded to be presence/absence of stroke history. Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.
run_lm("stroke_history", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 2.6330 0.4010 6.5680 0.0000 0.0000
## partial eta^2
## 0.0029
TukeyHSD(aov(brainPAD ~ stroke_history, data = non.healthy.df))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = brainPAD ~ stroke_history, data = non.healthy.df)
##
## $stroke_history
## diff lwr upr p adj
## stroke-no stroke 3.309603 2.502345 4.116861 0
Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.
with(subset(non.healthy.df, non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 > 0), table(facial_ageinguses_datacoding_100435_f1757_2_0))
## facial_ageinguses_datacoding_100435_f1757_2_0
## 1 2 3
## 10577 157 2668
run_lm("facial_ageinguses_datacoding_100435_f1757_2_0", subset(non.healthy.df, non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 > 0))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.5100 0.4540 -1.1240 0.2610 1.0000
## partial eta^2
## 0.0001
with(subset(non.healthy.df, non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 != -1 & non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 != -3), describeBy(brainPAD, as.factor(facial_ageinguses_datacoding_100435_f1757_2_0)))
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew
## X1 1 10577 0.53 5.8 0.28 0.39 5.57 -22.68 33.58 56.26 0.33
## kurtosis se
## X1 0.75 0.06
## --------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 157 0.29 6.02 -0.08 0.37 5.79 -17.04 18.9 35.94 -0.04 0.33
## se
## X1 0.48
## --------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2668 0.6 5.79 0.38 0.48 5.6 -16.63 37.5 54.13 0.37 1.1
## se
## X1 0.11
Prefer not to answer coded as -3. These values need to be excluded.
table(non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0)
##
## -3 0 1 2
## 49 9006 5009 558
with(subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3), table(smoking_statususes_datacoding_90_f20116_2_0))
## smoking_statususes_datacoding_90_f20116_2_0
## 0 1 2
## 9006 5009 558
run_lm("smoking_statususes_datacoding_90_f20116_2_0", subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.8600 0.1000 8.5950 0.0000 0.0000
## partial eta^2
## 0.0068
TukeyHSD(aov(brainPAD ~ smoking_statususes_datacoding_90_f20116_2_0, data = subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3)))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = brainPAD ~ smoking_statususes_datacoding_90_f20116_2_0, data = subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3))
##
## $smoking_statususes_datacoding_90_f20116_2_0
## diff lwr upr p adj
## 1-0 1.1819351 0.9434489 1.420421 0.0000000
## 2-0 1.9863522 1.3960902 2.576614 0.0000000
## 2-1 0.8044171 0.2005712 1.408263 0.0051089
with(subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3), describeBy(brainPAD, smoking_statususes_datacoding_90_f20116_2_0))
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 9006 0.11 5.68 -0.08 -0.01 5.45 -20.16 37.5 57.66 0.34 0.86
## se
## X1 0.06
## --------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew
## X1 1 5009 1.29 5.91 1.03 1.15 5.71 -22.68 33.58 56.26 0.32
## kurtosis se
## X1 0.82 0.08
## --------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 558 2.1 6.06 1.89 1.91 5.71 -21.95 28.03 49.98 0.36 1.05
## se
## X1 0.26
Prefer not to answer coded as -3. These values need to be excluded.
run_lm("alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0", subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.9740 0.1450 -6.7300 0.0000 0.0000
## partial eta^2
## 0.0089
TukeyHSD(aov(brainPAD ~ alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0, data = subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3)))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = brainPAD ~ alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0, data = subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3))
##
## $alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0
## diff lwr upr p adj
## 2-1 -1.11690006 -1.54163736 -0.69216276 0.0000000
## 3-1 -1.47167944 -1.89911078 -1.04424809 0.0000000
## 4-1 -1.69116020 -2.21225439 -1.17006602 0.0000000
## 5-1 -1.21355820 -1.75085954 -0.67625685 0.0000000
## 6-1 -0.98193559 -1.61939128 -0.34447989 0.0001656
## 3-2 -0.35477938 -0.72286179 0.01330303 0.0663959
## 4-2 -0.57426014 -1.04789107 -0.10062921 0.0072769
## 5-2 -0.09665814 -0.58806412 0.39474785 0.9934752
## 6-2 0.13496447 -0.46431582 0.73424476 0.9878232
## 4-3 -0.21948077 -0.69552912 0.25656759 0.7774077
## 5-3 0.25812124 -0.23561515 0.75185763 0.6707475
## 6-3 0.48974385 -0.11144884 1.09093654 0.1852536
## 5-4 0.47760201 -0.09912776 1.05433177 0.1705066
## 6-4 0.70922461 0.03819984 1.38024939 0.0311429
## 6-5 0.23162261 -0.45206433 0.91530955 0.9287997
with(subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3), describeBy(brainPAD, alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0))
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew
## X1 1 2388 1.69 6.05 1.32 1.49 5.74 -20.16 28.79 48.95 0.38
## kurtosis se
## X1 0.6 0.12
## --------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 4081 0.58 5.62 0.37 0.49 5.36 -22.5 28.25 50.75 0.25 0.72
## se
## X1 0.09
## --------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 3945 0.22 5.86 0.06 0.1 5.63 -22.68 37.5 60.17 0.42 1.45
## se
## X1 0.09
## --------------------------------------------------------
## group: 4
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1723 0 5.52 -0.22 -0.14 5.39 -16.4 22 38.4 0.29 0.26
## se
## X1 0.13
## --------------------------------------------------------
## group: 5
## vars n mean sd median trimmed mad min max range skew
## X1 1 1554 0.48 5.86 0.24 0.34 5.56 -19.32 26.17 45.49 0.32
## kurtosis se
## X1 0.77 0.15
## --------------------------------------------------------
## group: 6
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 929 0.71 5.82 0.39 0.61 5.67 -15.37 23.08 38.45 0.23 0.27
## se
## X1 0.19
Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.
run_lm("duration_of_moderate_activityuses_datacoding_100291_f894_2_0", subset(non.healthy.df, non.healthy.df$duration_of_moderate_activityuses_datacoding_100291_f894_2_0 != -3 & non.healthy.df$duration_of_moderate_activityuses_datacoding_100291_f894_2_0 != -1))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.000 0.001 0.357 0.721 1.000
## partial eta^2
## 0.000
run_lm("duration_of_vigorous_activityuses_datacoding_100291_f914_2_0", subset(non.healthy.df, non.healthy.df$duration_of_vigorous_activityuses_datacoding_100291_f914_2_0 != -3 & non.healthy.df$duration_of_vigorous_activityuses_datacoding_100291_f914_2_0 != -1))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.001 0.001 -0.402 0.687 1.000
## partial eta^2
## 0.000
Fluid intelligence, Trail-making task, Matrix pattern completion, Tower rearranging.
run_lm("fluid_intelligence_score_f20016_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.1420 0.0240 -5.9190 0.0000 0.0000
## partial eta^2
## 0.0026
## Trail making
run_lm("duration_to_complete_numeric_path_trail_1uses_datacoding_1990_f6348_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.00200 0.00100 2.57400 0.01000 0.18124
## partial eta^2
## 0.00110
run_lm("duration_to_complete_alphanumeric_path_trail_2uses_datacoding_1990_f6350_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.0020 0.0000 5.4190 0.0000 0.0000
## partial eta^2
## 0.0049
## Matrix pattern completion
run_lm("number_of_puzzles_correctly_solved_f6373_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.217 0.036 -5.973 0.000 0.000
## partial eta^2
## 0.006
run_lm("duration_spent_answering_each_puzzle_f6333_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.0090 0.0070 1.2980 0.1940 1.0000
## partial eta^2
## 0.0003
## Tower rearranging
run_lm("number_of_puzzles_correct_f6382_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.115 0.021 -5.473 0.000 0.000
## partial eta^2
## 0.005